###########################################################
# Author:     Jan Schwalbach 
# Date:       June 2024
# Project:    Mind the Context - The Role of Theoretical 
#             Concepts for Analyzing Legislative Text Data 
# Task:       Replication Script for all analyses in the 
#             article published in Research & Politics
###########################################################

# Loading relevant packages (all of these need to be installed)

library(ggplot2)
library(ggpubr)
library(dplyr)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textplots)
library(wordshoal)

# Read the German and Spanish speech corpora. Both RDS files must sit in the
# working directory, which should be the folder that contains this script.

Corpus_speeches_germany <- readRDS("./Corpus_speeches_germany.RDS")
Corpus_speeches_spain <- readRDS("./Corpus_speeches_spain.RDS")

# Germany: split the corpus at 2017-10-24. Speeches dated on or after that
# day form period 19; earlier speeches dated 2013-09-22 or later form
# period 18.

period_18 <- Corpus_speeches_germany[Corpus_speeches_germany$date < "2017-10-24", ]
period_18 <- period_18[period_18$date >= "2013-09-22", ]
period_19 <- Corpus_speeches_germany[Corpus_speeches_germany$date >= "2017-10-24", ]

# Spain: copy `text_nointeruption` into `text`, and derive `procedure_type`
# from the procedure ID by dropping everything from the first "/" onwards.

Corpus_speeches_spain$text <- Corpus_speeches_spain$text_nointeruption
Corpus_speeches_spain$procedure_type <- gsub(
  "/.*", "", Corpus_speeches_spain$speech_procedure_ID
)
table(Corpus_speeches_spain$procedure_type, useNA = "always")

# Spain: keep legislative periods 10 and 14

period_10 <- Corpus_speeches_spain[Corpus_speeches_spain$period == 10, ]
period_14 <- Corpus_speeches_spain[Corpus_speeches_spain$period == 14, ]

# Number of speeches per party (including missing party) in each period

table(period_19$party, useNA = "always")
table(period_18$party, useNA = "always")
table(period_10$party, useNA = "always")
table(period_14$party, useNA = "always")

# Share of chair speeches per period. Chair speeches are the rows with party
# "-" in the Spanish data and with a missing party in the German data.

with(period_10, length(party[party == "-"]) / length(party))
with(period_14, length(party[party == "-"]) / length(party))
sum(is.na(period_18$party)) / length(period_18$party)
sum(is.na(period_19$party)) / length(period_19$party)

# Drop the chair speeches from all four periods

period_10 <- period_10[period_10$party != "-", ]
period_14 <- period_14[period_14$party != "-", ]
period_18 <- period_18[!is.na(period_18$party), ]
period_19 <- period_19[!is.na(period_19$party), ]

### Speeches per debate

# A debate is keyed by `agenda` (Spain) or `debate` (Germany) pasted together
# with `date`. Tabulating that key gives one row per debate, with the number
# of speeches in the `Freq` column.

period_10$agenda_unique <- paste(period_10$agenda, period_10$date)
period_10_debate_speeches <- as.data.frame(table(period_10$agenda_unique))

period_14$agenda_unique <- paste(period_14$agenda, period_14$date)
period_14_debate_speeches <- as.data.frame(table(period_14$agenda_unique))

period_18$agenda_unique <- paste(period_18$debate, period_18$date)
period_18_debate_speeches <- as.data.frame(table(period_18$agenda_unique))

period_19$agenda_unique <- paste(period_19$debate, period_19$date)
period_19_debate_speeches <- as.data.frame(table(period_19$agenda_unique))

# Plotting Figure A1: one histogram per period of the number of speeches per
# debate (all debates), arranged in a single image

plot_a <- ggplot(period_10_debate_speeches, aes(Freq)) +
  geom_histogram(bins = 50) +
  theme_bw() +
  labs(
    title = "Speeches per Debate (Spain 11-15)",
    y = "Frequency", x = "Number of Speeches"
  ) +
  theme(text = element_text(size = 30))

plot_b <- ggplot(period_14_debate_speeches, aes(Freq)) +
  geom_histogram(bins = 50) +
  theme_bw() +
  labs(
    title = "Speeches per Debate (Spain 20-23)",
    y = "Frequency", x = "Number of Speeches"
  ) +
  theme(text = element_text(size = 30))

plot_c <- ggplot(period_18_debate_speeches, aes(Freq)) +
  geom_histogram(bins = 50) +
  theme_bw() +
  labs(
    title = "Speeches per Debate (Germany 13-17)",
    y = "Frequency", x = "Number of Speeches"
  ) +
  theme(text = element_text(size = 30))

plot_d <- ggplot(period_19_debate_speeches, aes(Freq)) +
  geom_histogram(bins = 50) +
  theme_bw() +
  labs(
    title = "Speeches per Debate (Germany 17-21)",
    y = "Frequency", x = "Number of Speeches"
  ) +
  theme(text = element_text(size = 30))

# The device size must be numeric: jpeg() does arithmetic on width/height, so
# a character height such as "1500" makes the call fail before anything is
# drawn.
jpeg("Figure_A1.jpg", width = 1500, height = 1500)
# Print explicitly so the figure is also written when the script is run via
# source(), where top-level values are not auto-printed.
print(ggarrange(plot_a, plot_b, plot_c, plot_d))
dev.off()

# Median number of speeches per debate, by period

median(period_10_debate_speeches[["Freq"]])
median(period_14_debate_speeches[["Freq"]])
median(period_18_debate_speeches[["Freq"]])
median(period_19_debate_speeches[["Freq"]])

# Largest number of speeches in a single debate, by period

max(period_10_debate_speeches[["Freq"]])
max(period_14_debate_speeches[["Freq"]])
max(period_18_debate_speeches[["Freq"]])
max(period_19_debate_speeches[["Freq"]])

# Selecting only legislative speeches: rows that carry a procedure ID
# (non-missing in the Spanish data, non-empty string in the German data).
# For each period the debate key is rebuilt and the speeches per debate are
# tabulated, as for the full corpora.

period_10_leg <- period_10[!is.na(period_10$speech_procedure_ID), ]
period_10_leg$agenda_unique <- paste(period_10_leg$agenda, period_10_leg$date)
period_10_leg_debate_speeches <- as.data.frame(table(period_10_leg$agenda_unique))

period_14_leg <- period_14[!is.na(period_14$speech_procedure_ID), ]
period_14_leg$agenda_unique <- paste(period_14_leg$agenda, period_14_leg$date)
period_14_leg_debate_speeches <- as.data.frame(table(period_14_leg$agenda_unique))

period_18_leg <- period_18[period_18$speech_procedure_ID != "", ]
period_18_leg$agenda_unique <- paste(period_18_leg$debate, period_18_leg$date)
period_18_leg_debate_speeches <- as.data.frame(table(period_18_leg$agenda_unique))

period_19_leg <- period_19[period_19$speech_procedure_ID != "", ]
period_19_leg$agenda_unique <- paste(period_19_leg$debate, period_19_leg$date)
period_19_leg_debate_speeches <- as.data.frame(table(period_19_leg$agenda_unique))

# Plotting Figure A2: one histogram per period of the number of speeches per
# debate (legislative speeches only), arranged in a single image

plot_a <- ggplot(period_10_leg_debate_speeches, aes(Freq)) +
  geom_histogram(bins = 50) +
  theme_bw() +
  labs(
    title = "Speeches per Debate (Spain 11-15)",
    y = "Frequency", x = "Number of Speeches"
  ) +
  theme(text = element_text(size = 30))

plot_b <- ggplot(period_14_leg_debate_speeches, aes(Freq)) +
  geom_histogram(bins = 50) +
  theme_bw() +
  labs(
    title = "Speeches per Debate (Spain 20-23)",
    y = "Frequency", x = "Number of Speeches"
  ) +
  theme(text = element_text(size = 30))

plot_c <- ggplot(period_18_leg_debate_speeches, aes(Freq)) +
  geom_histogram(bins = 50) +
  theme_bw() +
  labs(
    title = "Speeches per Debate (Germany 13-17)",
    y = "Frequency", x = "Number of Speeches"
  ) +
  theme(text = element_text(size = 30))

plot_d <- ggplot(period_19_leg_debate_speeches, aes(Freq)) +
  geom_histogram(bins = 50) +
  theme_bw() +
  labs(
    title = "Speeches per Debate (Germany 17-21)",
    y = "Frequency", x = "Number of Speeches"
  ) +
  theme(text = element_text(size = 30))

# Show the working directory, i.e. where the figure file will be written
getwd()
# The device size must be numeric: jpeg() does arithmetic on width/height, so
# a character height such as "1500" makes the call fail before anything is
# drawn.
jpeg("Figure_A2.jpg", width = 1500, height = 1500)
# Print explicitly so the figure is also written when the script is run via
# source(), where top-level values are not auto-printed.
print(ggarrange(plot_a, plot_b, plot_c, plot_d))
dev.off()

# Table A1 (first column): debate participation per party, Spain, period 10

period_10_party <- period_10[period_10$party != "-", ] %>%
  group_by(agenda)

# Per party: the number of distinct agenda items the party spoke on, and that
# number as a percentage of all distinct agenda items in the period.
period_10_party_summary <- period_10_party %>%
  group_by(party) %>%
  summarise(num_combinations = n_distinct(agenda)) %>%
  mutate(
    percent = round(
      num_combinations / n_distinct(period_10_party$agenda) * 100, 2
    )
  )

# Subsets restricted to the parties that participated in 75% and 50% of all
# debates in period 10 (the party lists are hard-coded)

period_10_75 <- filter(period_10, party %in% "PP")

period_10_50 <- filter(period_10, party %in% c("PP", "PSOE"))

# Table A2 (first column): debate participation per party, Spain, period 14

period_14_party <- period_14[period_14$party != "-", ] %>%
  group_by(agenda)

# Per party: the number of distinct agenda items the party spoke on, and that
# number as a percentage of all distinct agenda items in the period.
period_14_party_summary <- period_14_party %>%
  group_by(party) %>%
  summarise(num_combinations = n_distinct(agenda)) %>%
  mutate(
    percent = round(
      num_combinations / n_distinct(period_14_party$agenda) * 100, 2
    )
  )

# Subsets restricted to the parties that participated in 75% and 50% of all
# debates in period 14 (the party lists are hard-coded)

period_14_75 <- filter(period_14, party %in% "PSOE")

period_14_50 <- filter(period_14, party %in% c("PP", "PSOE", "Vox"))

# Table A3 (first column): debate participation per party, Germany, period 18

period_18_party <- period_18[!is.na(period_18$party), ] %>%
  group_by(debate)

# Per party: the number of distinct debates the party spoke in, and that
# number as a percentage of all distinct debates in the period.
period_18_party_summary <- period_18_party %>%
  group_by(party) %>%
  summarise(num_combinations = n_distinct(debate)) %>%
  mutate(
    percent = round(
      num_combinations / n_distinct(period_18_party$debate) * 100, 2
    )
  )

# Subsets for the 75% and 50% participation thresholds in period 18. Both
# thresholds use the same rule here: drop the "fraktionslos" speakers.

period_18_75 <- filter(period_18, !party %in% "fraktionslos")

period_18_50 <- filter(period_18, !party %in% "fraktionslos")

# Table A4 (first column): debate participation per party, Germany, period 19

period_19_party <- period_19[!is.na(period_19$party), ] %>%
  group_by(debate)

# Per party: the number of distinct debates the party spoke in, and that
# number as a percentage of all distinct debates in the period.
period_19_party_summary <- period_19_party %>%
  group_by(party) %>%
  summarise(num_combinations = n_distinct(debate)) %>%
  mutate(
    percent = round(
      num_combinations / n_distinct(period_19_party$debate) * 100, 2
    )
  )

# Subsets for the 75% and 50% participation thresholds in period 19. Both
# thresholds use the same rule here: drop the "fraktionslos" speakers.

period_19_75 <- filter(period_19, !party %in% "fraktionslos")

period_19_50 <- filter(period_19, !party %in% "fraktionslos")

## Filtering the corpora for debates with more than 10 / 5 speeches

# Chair speeches (party "-" in Spain, missing party in Germany) are excluded
# here as well, so that they cannot count towards the per-debate totals.
period_10 <- period_10[!period_10$party == "-",]
period_14 <- period_14[!period_14$party == "-",]
period_18 <- period_18[!is.na(period_18$party),]
period_19 <- period_19[!is.na(period_19$party),]

# Debates are keyed by `debate` (Germany) or `agenda` (Spain). Each result is
# ungrouped again so that later steps (subsetting, add_count(), corpus()) do
# not silently inherit a leftover grouping.
# NOTE(review): the speeches-per-debate counts use `agenda_unique`
# (agenda/debate plus date) as the debate key, whereas the key here has no
# date component - confirm that this difference is intended.

period_19_D10 <- period_19 %>%
  group_by(debate) %>%
  filter(n() > 10) %>%
  ungroup()

period_19_D5 <- period_19 %>%
  group_by(debate) %>%
  filter(n() > 5) %>%
  ungroup()

period_18_D10 <- period_18 %>%
  group_by(debate) %>%
  filter(n() > 10) %>%
  ungroup()

period_18_D5 <- period_18 %>%
  group_by(debate) %>%
  filter(n() > 5) %>%
  ungroup()

period_14_D10 <- period_14 %>%
  group_by(agenda) %>%
  filter(n() > 10) %>%
  ungroup()

period_14_D5 <- period_14 %>%
  group_by(agenda) %>%
  filter(n() > 5) %>%
  ungroup()

period_10_D10 <- period_10 %>%
  group_by(agenda) %>%
  filter(n() > 10) %>%
  ungroup()

period_10_D5 <- period_10 %>%
  group_by(agenda) %>%
  filter(n() > 5) %>%
  ungroup()

# Table A1 (second column): debate participation per party, Spain, period 10,
# restricted to debates with more than 5 speeches

period_10_party_2 <- period_10_D5[period_10_D5$party != "-", ] %>%
  group_by(agenda)

# Per party: the number of distinct agenda items the party spoke on, and that
# number as a percentage of all distinct agenda items in this subset.
period_10_party_2_summary <- period_10_party_2 %>%
  group_by(party) %>%
  summarise(num_combinations = n_distinct(agenda)) %>%
  mutate(
    percent = round(
      num_combinations / n_distinct(period_10_party_2$agenda) * 100, 2
    )
  )

# Subsets restricted to the parties that participated in 75% and 50% of the
# debates with more than 5 speeches in period 10 (hard-coded party lists)

period_10_75_D5 <- filter(
  period_10_D5,
  party %in% c(
    "PP", "PSOE", "Convergència i Unió", "Unión Progreso y Democracia"
  )
)

period_10_50_D5 <- filter(
  period_10_D5,
  party %in% c(
    "PP", "PSOE", "Convergència i Unió", "Unión Progreso y Democracia",
    "Eusko Alderdi Jeltzalea - Partido Nacionalista Vasco", "IU"
  )
)

# Table A2 (second column): debate participation per party, Spain, period 14,
# restricted to debates with more than 5 speeches

period_14_party_2 <- period_14_D5[period_14_D5$party != "-", ] %>%
  group_by(agenda)

# Per party: the number of distinct agenda items the party spoke on, and that
# number as a percentage of all distinct agenda items in this subset.
period_14_party_2_summary <- period_14_party_2 %>%
  group_by(party) %>%
  summarise(num_combinations = n_distinct(agenda)) %>%
  mutate(
    percent = round(
      num_combinations / n_distinct(period_14_party_2$agenda) * 100, 2
    )
  )

# Subsets restricted to the parties that participated in 75% and 50% of the
# debates with more than 5 speeches in period 14 (hard-coded party lists)

period_14_75_D5 <- filter(
  period_14_D5,
  party %in% c(
    "PP", "PSOE", "Vox", "PODEMOS", "Ciudadanos-Partido de la Ciudadanía"
  )
)

period_14_50_D5 <- filter(
  period_14_D5,
  party %in% c(
    "PP", "PSOE", "Vox", "PODEMOS", "Ciudadanos-Partido de la Ciudadanía",
    "Eusko Alderdi Jeltzalea - Partido Nacionalista Vasco",
    "Esquerra Republicana de Catalunya", "Junts per Catalunya-Junts"
  )
)

# Table A3 (second column): debate participation per party, Germany,
# period 18, restricted to debates with more than 5 speeches

period_18_party_2 <- period_18_D5[!is.na(period_18_D5$party), ] %>%
  group_by(debate)

# Per party: the number of distinct debates the party spoke in, and that
# number as a percentage of all distinct debates in this subset.
period_18_party_2_summary <- period_18_party_2 %>%
  group_by(party) %>%
  summarise(num_combinations = n_distinct(debate)) %>%
  mutate(
    percent = round(
      num_combinations / n_distinct(period_18_party_2$debate) * 100, 2
    )
  )

# Subsets for the 75% and 50% participation thresholds among the debates with
# more than 5 speeches in period 18. Both thresholds use the same rule here:
# drop the "fraktionslos" speakers.

period_18_75_D5 <- filter(period_18_D5, !party %in% "fraktionslos")

period_18_50_D5 <- filter(period_18_D5, !party %in% "fraktionslos")

# Producing Table A4 (second column)
# Debate participation per party for period 19 in Germany
# only for debates with > 5 speeches

# Start from the >5-speeches subset (period_19_D5). Building this table from
# the full period_19 data would simply reproduce the first column.
period_19_party_2 <- period_19_D5[!is.na(period_19_D5$party),]

period_19_party_2 <- period_19_party_2 %>%
  group_by(debate)

# Per party: the number of distinct debates the party spoke in, and that
# number as a percentage of all distinct debates in this subset.
period_19_party_2_summary <- period_19_party_2 %>%
  group_by(party) %>%
  summarize(num_combinations = n_distinct(debate)) %>%
  mutate(percent = round(num_combinations / n_distinct(period_19_party_2$debate) * 100, 2))

# Filtering only parties that participated in 75% and 50% of the >5 speeches debates
# for period 19 in Germany (both thresholds drop the "fraktionslos" speakers)

period_19_75_D5 <- period_19_D5 %>%
  filter(!party %in% c("fraktionslos"))

period_19_50_D5 <- period_19_D5 %>%
  filter(!party %in% c("fraktionslos"))

# Wordshoal models for all four periods, using the speeches from all debates.
# Every period goes through the same steps: drop debates (agenda_unique) that
# contain a single speech, build a corpus and a document-feature matrix, and
# fit the model with debates as groups and parties as authors.

# Spain - period 10

period_10 <- add_count(period_10, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_10_corpus <- corpus(period_10)
period_10_dfm <- period_10_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_10 <- textmodel_wordshoal(
  period_10_dfm,
  groups = docvars(period_10_corpus, "agenda_unique"),
  authors = docvars(period_10_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_10)

# Spain - period 14

period_14 <- add_count(period_14, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_14_corpus <- corpus(period_14)
period_14_dfm <- period_14_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_14 <- textmodel_wordshoal(
  period_14_dfm,
  groups = docvars(period_14_corpus, "agenda_unique"),
  authors = docvars(period_14_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_14)

# Germany - period 18

period_18 <- add_count(period_18, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_18_corpus <- corpus(period_18)
period_18_dfm <- period_18_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_18 <- textmodel_wordshoal(
  period_18_dfm,
  groups = docvars(period_18_corpus, "agenda_unique"),
  authors = docvars(period_18_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_18)

# Germany - period 19

period_19 <- add_count(period_19, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_19_corpus <- corpus(period_19)
period_19_dfm <- period_19_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_19 <- textmodel_wordshoal(
  period_19_dfm,
  groups = docvars(period_19_corpus, "agenda_unique"),
  authors = docvars(period_19_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_19)

# Wordshoal models for all four periods, using the speeches from all debates
# but only the parties that are part of more than 50% of all debates.
# Steps per period: drop debates (agenda_unique) left with a single speech,
# rebuild the period's corpus and document-feature matrix from the subset,
# and fit the model with debates as groups and parties as authors.
# NOTE(review): `dir = c(7, 1)` presumably refers to the 7th and 1st author
# when orienting the scale; period_10_50 and period_14_50 were filtered down
# to two and three parties - confirm that position 7 is meaningful here.

# Spain - period 10

period_10_50 <- add_count(period_10_50, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_10_corpus <- corpus(period_10_50)
period_10_dfm <- period_10_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_10_50 <- textmodel_wordshoal(
  period_10_dfm,
  groups = docvars(period_10_corpus, "agenda_unique"),
  authors = docvars(period_10_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_10_50)

# Spain - period 14

period_14_50 <- add_count(period_14_50, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_14_corpus <- corpus(period_14_50)
period_14_dfm <- period_14_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_14_50 <- textmodel_wordshoal(
  period_14_dfm,
  groups = docvars(period_14_corpus, "agenda_unique"),
  authors = docvars(period_14_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_14_50)

# Germany - period 18

period_18_50 <- add_count(period_18_50, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_18_corpus <- corpus(period_18_50)
period_18_dfm <- period_18_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_18_50 <- textmodel_wordshoal(
  period_18_dfm,
  groups = docvars(period_18_corpus, "agenda_unique"),
  authors = docvars(period_18_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_18_50)

# Germany - period 19

period_19_50 <- add_count(period_19_50, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_19_corpus <- corpus(period_19_50)
period_19_dfm <- period_19_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_19_50 <- textmodel_wordshoal(
  period_19_dfm,
  groups = docvars(period_19_corpus, "agenda_unique"),
  authors = docvars(period_19_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_19_50)

# Wordshoal models for the speeches from debates with more than 5 speeches
# (all four periods; the period 19 model follows below).
# Steps per period: drop debates (agenda_unique) left with a single speech,
# rebuild the period's corpus and document-feature matrix from the subset,
# and fit the model with debates as groups and parties as authors.

# Spain - period 10

period_10_D5 <- add_count(period_10_D5, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_10_corpus <- corpus(period_10_D5)
period_10_dfm <- period_10_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_10_D5 <- textmodel_wordshoal(
  period_10_dfm,
  groups = docvars(period_10_corpus, "agenda_unique"),
  authors = docvars(period_10_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_10_D5)

# Spain - period 14

period_14_D5 <- add_count(period_14_D5, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_14_corpus <- corpus(period_14_D5)
period_14_dfm <- period_14_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_14_D5 <- textmodel_wordshoal(
  period_14_dfm,
  groups = docvars(period_14_corpus, "agenda_unique"),
  authors = docvars(period_14_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_14_D5)

# Germany - period 18

period_18_D5 <- add_count(period_18_D5, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_18_corpus <- corpus(period_18_D5)
period_18_dfm <- period_18_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_18_D5 <- textmodel_wordshoal(
  period_18_dfm,
  groups = docvars(period_18_corpus, "agenda_unique"),
  authors = docvars(period_18_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_18_D5)

# Germany - period 19, debates with more than 5 speeches: drop debates
# (agenda_unique) left with a single speech, then fit the wordshoal model.

period_19_D5 <- period_19_D5 %>%
  add_count(agenda_unique) %>% 
  filter(n!=1) %>%
  select(-n)

# Build the corpus from period_19_D5, the subset prepared just above.
# Using period_19_50 here would fit this model on the wrong data and make it
# a duplicate of the 50%-parties model.
period_19_corpus <- corpus(period_19_D5)
period_19_dfm <- dfm(tokens(period_19_corpus))

wordshoalfit_19_D5 <- 
  textmodel_wordshoal(period_19_dfm, dir = c(7,1),
                      groups = docvars(period_19_corpus, "agenda_unique"), 
                      authors = docvars(period_19_corpus, "party"))
summary(wordshoalfit_19_D5)  

# Wordshoal models for all four periods, using the speeches from debates with
# more than 5 speeches and only the parties that are part of more than 50% of
# all debates.
# Steps per period: drop debates (agenda_unique) left with a single speech,
# rebuild the period's corpus and document-feature matrix from the subset,
# and fit the model with debates as groups and parties as authors.

# Spain - period 10

period_10_50_D5 <- add_count(period_10_50_D5, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_10_corpus <- corpus(period_10_50_D5)
period_10_dfm <- period_10_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_10_50_D5 <- textmodel_wordshoal(
  period_10_dfm,
  groups = docvars(period_10_corpus, "agenda_unique"),
  authors = docvars(period_10_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_10_50_D5)

# Spain - period 14

period_14_50_D5 <- add_count(period_14_50_D5, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_14_corpus <- corpus(period_14_50_D5)
period_14_dfm <- period_14_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_14_50_D5 <- textmodel_wordshoal(
  period_14_dfm,
  groups = docvars(period_14_corpus, "agenda_unique"),
  authors = docvars(period_14_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_14_50_D5)

# Germany - period 18

period_18_50_D5 <- add_count(period_18_50_D5, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_18_corpus <- corpus(period_18_50_D5)
period_18_dfm <- period_18_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_18_50_D5 <- textmodel_wordshoal(
  period_18_dfm,
  groups = docvars(period_18_corpus, "agenda_unique"),
  authors = docvars(period_18_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_18_50_D5)

# Germany - period 19

period_19_50_D5 <- add_count(period_19_50_D5, agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_19_corpus <- corpus(period_19_50_D5)
period_19_dfm <- period_19_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_19_50_D5 <- textmodel_wordshoal(
  period_19_dfm,
  groups = docvars(period_19_corpus, "agenda_unique"),
  authors = docvars(period_19_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_19_50_D5)

# Wordshoal models for all four periods, using the speeches from legislative
# debates with more than 5 speeches and only the parties that are part of
# more than 50% of all debates.
# Steps per period: keep the rows that carry a procedure ID (non-missing for
# Spain, non-empty string for Germany), drop debates (agenda_unique) left
# with a single speech, rebuild the period's corpus and document-feature
# matrix, and fit the model with debates as groups and parties as authors.

# Spain - period 10

period_10_50_D5_leg <-
  period_10_50_D5[!is.na(period_10_50_D5$speech_procedure_ID), ] %>%
  add_count(agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_10_corpus <- corpus(period_10_50_D5_leg)
period_10_dfm <- period_10_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_10_50_D5_leg <- textmodel_wordshoal(
  period_10_dfm,
  groups = docvars(period_10_corpus, "agenda_unique"),
  authors = docvars(period_10_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_10_50_D5_leg)

# Spain - period 14

period_14_50_D5_leg <-
  period_14_50_D5[!is.na(period_14_50_D5$speech_procedure_ID), ] %>%
  add_count(agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_14_corpus <- corpus(period_14_50_D5_leg)
period_14_dfm <- period_14_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_14_50_D5_leg <- textmodel_wordshoal(
  period_14_dfm,
  groups = docvars(period_14_corpus, "agenda_unique"),
  authors = docvars(period_14_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_14_50_D5_leg)

# Germany - period 18

period_18_50_D5_leg <-
  period_18_50_D5[period_18_50_D5$speech_procedure_ID != "", ] %>%
  add_count(agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_18_corpus <- corpus(period_18_50_D5_leg)
period_18_dfm <- period_18_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_18_50_D5_leg <- textmodel_wordshoal(
  period_18_dfm,
  groups = docvars(period_18_corpus, "agenda_unique"),
  authors = docvars(period_18_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_18_50_D5_leg)

# Germany - period 19

period_19_50_D5_leg <-
  period_19_50_D5[period_19_50_D5$speech_procedure_ID != "", ] %>%
  add_count(agenda_unique) %>%
  filter(n != 1) %>%
  select(-n)

period_19_corpus <- corpus(period_19_50_D5_leg)
period_19_dfm <- period_19_corpus %>%
  tokens() %>%
  dfm()

wordshoalfit_19_50_D5_leg <- textmodel_wordshoal(
  period_19_dfm,
  groups = docvars(period_19_corpus, "agenda_unique"),
  authors = docvars(period_19_corpus, "party"),
  dir = c(7, 1)
)
summary(wordshoalfit_19_50_D5_leg)

# Saving all wordshoal model output: one .RData file per fitted model,
# written to the working directory

# All debates
save(list = "wordshoalfit_10", file = "wordshoalfit_10.RData")
save(list = "wordshoalfit_14", file = "wordshoalfit_14.RData")
save(list = "wordshoalfit_18", file = "wordshoalfit_18.RData")
save(list = "wordshoalfit_19", file = "wordshoalfit_19.RData")

# Parties above the 50% participation threshold
save(list = "wordshoalfit_10_50", file = "wordshoalfit_10_50.RData")
save(list = "wordshoalfit_14_50", file = "wordshoalfit_14_50.RData")
save(list = "wordshoalfit_18_50", file = "wordshoalfit_18_50.RData")
save(list = "wordshoalfit_19_50", file = "wordshoalfit_19_50.RData")

# Debates with more than 5 speeches
save(list = "wordshoalfit_10_D5", file = "wordshoalfit_10_D5.RData")
save(list = "wordshoalfit_14_D5", file = "wordshoalfit_14_D5.RData")
save(list = "wordshoalfit_18_D5", file = "wordshoalfit_18_D5.RData")
save(list = "wordshoalfit_19_D5", file = "wordshoalfit_19_D5.RData")

# NOTE(review): the remaining models are stored in objects named *_50_*, but
# their files are named *_75_*. Confirm which label is intended before
# relying on the file names.

# Debates with more than 5 speeches, restricted party set
save(list = "wordshoalfit_10_50_D5", file = "wordshoalfit_10_75_D5.RData")
save(list = "wordshoalfit_14_50_D5", file = "wordshoalfit_14_75_D5.RData")
save(list = "wordshoalfit_18_50_D5", file = "wordshoalfit_18_75_D5.RData")
save(list = "wordshoalfit_19_50_D5", file = "wordshoalfit_19_75_D5.RData")

# Legislative debates with more than 5 speeches, restricted party set
save(list = "wordshoalfit_10_50_D5_leg", file = "wordshoalfit_10_75_D5_leg.RData")
save(list = "wordshoalfit_14_50_D5_leg", file = "wordshoalfit_14_75_D5_leg.RData")
save(list = "wordshoalfit_18_50_D5_leg", file = "wordshoalfit_18_75_D5_leg.RData")
save(list = "wordshoalfit_19_50_D5_leg", file = "wordshoalfit_19_75_D5_leg.RData")

### Correspondence analyses, one per period, on the *_50_D5_leg subsets.
### Steps per period: build the corpus and document-feature matrix, pool the
### counts by party, fit the correspondence analysis, plot the
### one-dimensional scaling, and collect the document coefficients requested
### with doc_dim = 1 and doc_dim = 2 in a data frame.

### Spain - period 10

period_10_corpus <- corpus(period_10_50_D5_leg)
period_10_dfm <- period_10_corpus %>%
  tokens() %>%
  dfm()
dfmat_party_10 <- dfm_group(period_10_dfm, groups = party)

tmod_ca_10 <- textmodel_ca(dfmat_party_10)
textplot_scale1d(tmod_ca_10)

dat_ca_10 <- data.frame(
  dim1 = coef(tmod_ca_10, doc_dim = 1)[["coef_document"]],
  dim2 = coef(tmod_ca_10, doc_dim = 2)[["coef_document"]]
)

### Spain - period 14

period_14_corpus <- corpus(period_14_50_D5_leg)
period_14_dfm <- period_14_corpus %>%
  tokens() %>%
  dfm()
dfmat_party_14 <- dfm_group(period_14_dfm, groups = party)

tmod_ca_14 <- textmodel_ca(dfmat_party_14)
textplot_scale1d(tmod_ca_14)

dat_ca_14 <- data.frame(
  dim1 = coef(tmod_ca_14, doc_dim = 1)[["coef_document"]],
  dim2 = coef(tmod_ca_14, doc_dim = 2)[["coef_document"]]
)

### Germany - period 18

period_18_corpus <- corpus(period_18_50_D5_leg)
period_18_dfm <- period_18_corpus %>%
  tokens() %>%
  dfm()
dfmat_party_18 <- dfm_group(period_18_dfm, groups = party)

tmod_ca_18 <- textmodel_ca(dfmat_party_18)
textplot_scale1d(tmod_ca_18)

dat_ca_18 <- data.frame(
  dim1 = coef(tmod_ca_18, doc_dim = 1)[["coef_document"]],
  dim2 = coef(tmod_ca_18, doc_dim = 2)[["coef_document"]]
)

### Germany - period 19

period_19_corpus <- corpus(period_19_50_D5_leg)
period_19_dfm <- period_19_corpus %>%
  tokens() %>%
  dfm()
dfmat_party_19 <- dfm_group(period_19_dfm, groups = party)

tmod_ca_19 <- textmodel_ca(dfmat_party_19)
textplot_scale1d(tmod_ca_19)

dat_ca_19 <- data.frame(
  dim1 = coef(tmod_ca_19, doc_dim = 1)[["coef_document"]],
  dim2 = coef(tmod_ca_19, doc_dim = 2)[["coef_document"]]
)